url.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. from __future__ import absolute_import
  2. import re
  3. from collections import namedtuple
  4. from ..exceptions import LocationParseError
  5. from ..packages import six
  6. url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]
  7. # We only want to normalize urls with an HTTP(S) scheme.
  8. # urllib3 infers URLs without a scheme (None) to be http.
  9. NORMALIZABLE_SCHEMES = ("http", "https", None)
  10. # Almost all of these patterns were derived from the
  11. # 'rfc3986' module: https://github.com/python-hyper/rfc3986
  12. PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
  13. SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
  14. URI_RE = re.compile(
  15. r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
  16. r"(?://([^\\/?#]*))?"
  17. r"([^?#]*)"
  18. r"(?:\?([^#]*))?"
  19. r"(?:#(.*))?$",
  20. re.UNICODE | re.DOTALL,
  21. )
  22. IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
  23. HEX_PAT = "[0-9A-Fa-f]{1,4}"
  24. LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
  25. _subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
  26. _variations = [
  27. # 6( h16 ":" ) ls32
  28. "(?:%(hex)s:){6}%(ls32)s",
  29. # "::" 5( h16 ":" ) ls32
  30. "::(?:%(hex)s:){5}%(ls32)s",
  31. # [ h16 ] "::" 4( h16 ":" ) ls32
  32. "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
  33. # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
  34. "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
  35. # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
  36. "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
  37. # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
  38. "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
  39. # [ *4( h16 ":" ) h16 ] "::" ls32
  40. "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
  41. # [ *5( h16 ":" ) h16 ] "::" h16
  42. "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
  43. # [ *6( h16 ":" ) h16 ] "::"
  44. "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
  45. ]
  46. UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~"
  47. IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
  48. ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
  49. IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
  50. REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
  51. TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")
  52. IPV4_RE = re.compile("^" + IPV4_PAT + "$")
  53. IPV6_RE = re.compile("^" + IPV6_PAT + "$")
  54. IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
  55. BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
  56. ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")
  57. SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % (
  58. REG_NAME_PAT,
  59. IPV4_PAT,
  60. IPV6_ADDRZ_PAT,
  61. )
  62. SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL)
  63. UNRESERVED_CHARS = set(
  64. "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
  65. )
  66. SUB_DELIM_CHARS = set("!$&'()*+,;=")
  67. USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}
  68. PATH_CHARS = USERINFO_CHARS | {"@", "/"}
  69. QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}
  70. class Url(namedtuple("Url", url_attrs)):
  71. """
  72. Data structure for representing an HTTP URL. Used as a return value for
  73. :func:`parse_url`. Both the scheme and host are normalized as they are
  74. both case-insensitive according to RFC 3986.
  75. """
  76. __slots__ = ()
  77. def __new__(
  78. cls,
  79. scheme=None,
  80. auth=None,
  81. host=None,
  82. port=None,
  83. path=None,
  84. query=None,
  85. fragment=None,
  86. ):
  87. if path and not path.startswith("/"):
  88. path = "/" + path
  89. if scheme is not None:
  90. scheme = scheme.lower()
  91. return super(Url, cls).__new__(
  92. cls, scheme, auth, host, port, path, query, fragment
  93. )
  94. @property
  95. def hostname(self):
  96. """For backwards-compatibility with urlparse. We're nice like that."""
  97. return self.host
  98. @property
  99. def request_uri(self):
  100. """Absolute path including the query string."""
  101. uri = self.path or "/"
  102. if self.query is not None:
  103. uri += "?" + self.query
  104. return uri
  105. @property
  106. def netloc(self):
  107. """Network location including host and port"""
  108. if self.port:
  109. return "%s:%d" % (self.host, self.port)
  110. return self.host
  111. @property
  112. def url(self):
  113. """
  114. Convert self into a url
  115. This function should more or less round-trip with :func:`.parse_url`. The
  116. returned url may not be exactly the same as the url inputted to
  117. :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
  118. with a blank port will have : removed).
  119. Example: ::
  120. >>> U = parse_url('http://google.com/mail/')
  121. >>> U.url
  122. 'http://google.com/mail/'
  123. >>> Url('http', 'username:password', 'host.com', 80,
  124. ... '/path', 'query', 'fragment').url
  125. 'http://username:password@host.com:80/path?query#fragment'
  126. """
  127. scheme, auth, host, port, path, query, fragment = self
  128. url = u""
  129. # We use "is not None" we want things to happen with empty strings (or 0 port)
  130. if scheme is not None:
  131. url += scheme + u"://"
  132. if auth is not None:
  133. url += auth + u"@"
  134. if host is not None:
  135. url += host
  136. if port is not None:
  137. url += u":" + str(port)
  138. if path is not None:
  139. url += path
  140. if query is not None:
  141. url += u"?" + query
  142. if fragment is not None:
  143. url += u"#" + fragment
  144. return url
  145. def __str__(self):
  146. return self.url
  147. def split_first(s, delims):
  148. """
  149. .. deprecated:: 1.25
  150. Given a string and an iterable of delimiters, split on the first found
  151. delimiter. Return two split parts and the matched delimiter.
  152. If not found, then the first part is the full input string.
  153. Example::
  154. >>> split_first('foo/bar?baz', '?/=')
  155. ('foo', 'bar?baz', '/')
  156. >>> split_first('foo/bar?baz', '123')
  157. ('foo/bar?baz', '', None)
  158. Scales linearly with number of delims. Not ideal for large number of delims.
  159. """
  160. min_idx = None
  161. min_delim = None
  162. for d in delims:
  163. idx = s.find(d)
  164. if idx < 0:
  165. continue
  166. if min_idx is None or idx < min_idx:
  167. min_idx = idx
  168. min_delim = d
  169. if min_idx is None or min_idx < 0:
  170. return s, "", None
  171. return s[:min_idx], s[min_idx + 1 :], min_delim
  172. def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
  173. """Percent-encodes a URI component without reapplying
  174. onto an already percent-encoded component.
  175. """
  176. if component is None:
  177. return component
  178. component = six.ensure_text(component)
  179. # Normalize existing percent-encoded bytes.
  180. # Try to see if the component we're encoding is already percent-encoded
  181. # so we can skip all '%' characters but still encode all others.
  182. component, percent_encodings = PERCENT_RE.subn(
  183. lambda match: match.group(0).upper(), component
  184. )
  185. uri_bytes = component.encode("utf-8", "surrogatepass")
  186. is_percent_encoded = percent_encodings == uri_bytes.count(b"%")
  187. encoded_component = bytearray()
  188. for i in range(0, len(uri_bytes)):
  189. # Will return a single character bytestring on both Python 2 & 3
  190. byte = uri_bytes[i : i + 1]
  191. byte_ord = ord(byte)
  192. if (is_percent_encoded and byte == b"%") or (
  193. byte_ord < 128 and byte.decode() in allowed_chars
  194. ):
  195. encoded_component += byte
  196. continue
  197. encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))
  198. return encoded_component.decode(encoding)
  199. def _remove_path_dot_segments(path):
  200. # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
  201. segments = path.split("/") # Turn the path into a list of segments
  202. output = [] # Initialize the variable to use to store output
  203. for segment in segments:
  204. # '.' is the current directory, so ignore it, it is superfluous
  205. if segment == ".":
  206. continue
  207. # Anything other than '..', should be appended to the output
  208. elif segment != "..":
  209. output.append(segment)
  210. # In this case segment == '..', if we can, we should pop the last
  211. # element
  212. elif output:
  213. output.pop()
  214. # If the path starts with '/' and the output is empty or the first string
  215. # is non-empty
  216. if path.startswith("/") and (not output or output[0]):
  217. output.insert(0, "")
  218. # If the path starts with '/.' or '/..' ensure we add one more empty
  219. # string to add a trailing '/'
  220. if path.endswith(("/.", "/..")):
  221. output.append("")
  222. return "/".join(output)
  223. def _normalize_host(host, scheme):
  224. if host:
  225. if isinstance(host, six.binary_type):
  226. host = six.ensure_str(host)
  227. if scheme in NORMALIZABLE_SCHEMES:
  228. is_ipv6 = IPV6_ADDRZ_RE.match(host)
  229. if is_ipv6:
  230. match = ZONE_ID_RE.search(host)
  231. if match:
  232. start, end = match.span(1)
  233. zone_id = host[start:end]
  234. if zone_id.startswith("%25") and zone_id != "%25":
  235. zone_id = zone_id[3:]
  236. else:
  237. zone_id = zone_id[1:]
  238. zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS)
  239. return host[:start].lower() + zone_id + host[end:]
  240. else:
  241. return host.lower()
  242. elif not IPV4_RE.match(host):
  243. return six.ensure_str(
  244. b".".join([_idna_encode(label) for label in host.split(".")])
  245. )
  246. return host
  247. def _idna_encode(name):
  248. if name and any([ord(x) > 128 for x in name]):
  249. try:
  250. from pip._vendor import idna
  251. except ImportError:
  252. six.raise_from(
  253. LocationParseError("Unable to parse URL without the 'idna' module"),
  254. None,
  255. )
  256. try:
  257. return idna.encode(name.lower(), strict=True, std3_rules=True)
  258. except idna.IDNAError:
  259. six.raise_from(
  260. LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
  261. )
  262. return name.lower().encode("ascii")
  263. def _encode_target(target):
  264. """Percent-encodes a request target so that there are no invalid characters"""
  265. path, query = TARGET_RE.match(target).groups()
  266. target = _encode_invalid_chars(path, PATH_CHARS)
  267. query = _encode_invalid_chars(query, QUERY_CHARS)
  268. if query is not None:
  269. target += "?" + query
  270. return target
  271. def parse_url(url):
  272. """
  273. Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
  274. performed to parse incomplete urls. Fields not provided will be None.
  275. This parser is RFC 3986 compliant.
  276. The parser logic and helper functions are based heavily on
  277. work done in the ``rfc3986`` module.
  278. :param str url: URL to parse into a :class:`.Url` namedtuple.
  279. Partly backwards-compatible with :mod:`urlparse`.
  280. Example::
  281. >>> parse_url('http://google.com/mail/')
  282. Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
  283. >>> parse_url('google.com:80')
  284. Url(scheme=None, host='google.com', port=80, path=None, ...)
  285. >>> parse_url('/foo?bar')
  286. Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
  287. """
  288. if not url:
  289. # Empty
  290. return Url()
  291. source_url = url
  292. if not SCHEME_RE.search(url):
  293. url = "//" + url
  294. try:
  295. scheme, authority, path, query, fragment = URI_RE.match(url).groups()
  296. normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES
  297. if scheme:
  298. scheme = scheme.lower()
  299. if authority:
  300. auth, host, port = SUBAUTHORITY_RE.match(authority).groups()
  301. if auth and normalize_uri:
  302. auth = _encode_invalid_chars(auth, USERINFO_CHARS)
  303. if port == "":
  304. port = None
  305. else:
  306. auth, host, port = None, None, None
  307. if port is not None:
  308. port = int(port)
  309. if not (0 <= port <= 65535):
  310. raise LocationParseError(url)
  311. host = _normalize_host(host, scheme)
  312. if normalize_uri and path:
  313. path = _remove_path_dot_segments(path)
  314. path = _encode_invalid_chars(path, PATH_CHARS)
  315. if normalize_uri and query:
  316. query = _encode_invalid_chars(query, QUERY_CHARS)
  317. if normalize_uri and fragment:
  318. fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)
  319. except (ValueError, AttributeError):
  320. return six.raise_from(LocationParseError(source_url), None)
  321. # For the sake of backwards compatibility we put empty
  322. # string values for path if there are any defined values
  323. # beyond the path in the URL.
  324. # TODO: Remove this when we break backwards compatibility.
  325. if not path:
  326. if query is not None or fragment is not None:
  327. path = ""
  328. else:
  329. path = None
  330. # Ensure that each part of the URL is a `str` for
  331. # backwards compatibility.
  332. if isinstance(url, six.text_type):
  333. ensure_func = six.ensure_text
  334. else:
  335. ensure_func = six.ensure_str
  336. def ensure_type(x):
  337. return x if x is None else ensure_func(x)
  338. return Url(
  339. scheme=ensure_type(scheme),
  340. auth=ensure_type(auth),
  341. host=ensure_type(host),
  342. port=port,
  343. path=ensure_type(path),
  344. query=ensure_type(query),
  345. fragment=ensure_type(fragment),
  346. )
  347. def get_host(url):
  348. """
  349. Deprecated. Use :func:`parse_url` instead.
  350. """
  351. p = parse_url(url)
  352. return p.scheme or "http", p.hostname, p.port